library(cluster)
## Warning: package 'cluster' was built under R version 3.4.4
library(mclust)
## Warning: package 'mclust' was built under R version 3.4.4
## Package 'mclust' version 5.4.1
## Type 'citation("mclust")' for citing this R package in publications.
library(lattice)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.2
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1.9000 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.5
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.2
## Warning: package 'forcats' was built under R version 3.4.3
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::map() masks mclust::map()
library(maptree)
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 3.4.3
library(mice)
## Warning: package 'mice' was built under R version 3.4.4
##
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
##
## complete
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(mclust)
library(mice)
library(nFactors)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: psych
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:mclust':
##
## sim
## Loading required package: boot
## Warning: package 'boot' was built under R version 3.4.1
##
## Attaching package: 'boot'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:lattice':
##
## melanoma
##
## Attaching package: 'nFactors'
## The following object is masked from 'package:lattice':
##
## parallel
library(poLCA)
## Loading required package: scatterplot3d
## Warning: package 'scatterplot3d' was built under R version 3.4.4
library(Rtsne)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
utility_histogramplotter <- function(x) {
lattice::histogram( ~ x)
}
seg.summ <- function(data, groups) {
aggregate(data, list(groups), function(x) mean(as.numeric(as.character(x))))
}
minmax <- function(x)
{
return((x- min(x)) /(max(x)-min(x)))
}
plot_bi_plots <- function(df, y,...) {
old.par = par()
par(mar=c(5,4,5,2))
on.exit(expr = 'par = old.par')
prcomp(df) %>% biplot(col = c('gray', 'red'), cex = .8, main = y, ...)
}
adjust_q13 <- function(df){
df %<>%
mutate(q13_visitfreq_social= (q13r1+q13r2+q13r3 +q13r11)/4,
q13_visitfreq_music = (q13r4+q13r7+q13r8 +q13r9)/4,
q13_visitfreq_video = (q13r5+q13r6+q13r10+q13r12)/4)
df[,!str_detect(names(df),'q13r')]
}
adjust_q24 <- function(df){
df %<>%
mutate(
q24_tech_posatt = (q24r1+q24r2+q24r3+q24r5+q24r6)/5,
q24_tech_enter = (q24r7+q24r8)/2,
q24_tech_comm = (q24r10+q24r11+q24r12)/3,
q24_tech_negatv = (q24r4+q24r9)/2
)
df[,!str_detect(names(df),'q24r')]
}
adjust_q25 <- function(df){
df %<>%
mutate(
q25_prsnlty_leader = (q25r1+q25r2+q25r3+q25r4)/4,
q25_prsnlty_risk = (q25r5+q25r7+q25r8)/3,
q25_prsnlty_drive = (q25r9+q25r10+q25r11+q25r12)/4,
q25_prsnlty_follower = q25r6
)
df[,!str_detect(names(df),'q25r')]
}
adjust_q26 <- function(df){
df %<>%
mutate(
q26_shopsavvy_bargain = (q26r3+q26r5)/2,
q26_shopsavvy_brands = (q26r18+q26r7+q26r13+q26r14+q26r15)/5,
q26_shopsavvy_earn2spend = (q26r13+q26r16+q26r4)/3,
q26_shopsavvy_applover = (q26r17+q26r12+q26r10+q26r8+q26r6+q26r9)/6,
q26_shopsavvy_children = q26r11,
)
df[,!str_detect(names(df),'q26r')]
}
adjust_q2 <- function(df){
df %<>%
mutate(
q2_apple = ifelse(q2r1==1|q2r2==1,1,0),
q2_andriod = q2r3,
q2_windows = q2r6,
q2_tablet = q2r8,
q2_other = ifelse(q2r4==1|q2r5==1|q2r7==1|q2r9==1,1,0)
)
df[,!str_detect(names(df),'q2r')]
}
adjust_q4 <- function(df){
df %<>%
mutate(
q4_use_music_apps =q4r1,
q4_use_tv_apps =ifelse(q4r2==1|q4r3==1|q4r4==1,1,0),
q4_use_game_apps =q4r5,
q4_use_social_apps =q4r6,
q4_use_news_apps =ifelse(q4r7==1|q4r9==1,1,0),
q4_use_shop_apps =q4r8,
q4_use_none_apps =q4r11
)
df[,!str_detect(names(df),'q4r')]
}
adjust_race <- function(df){
df %>%
mutate(
q54_white = ifelse(q54==1,1,0),
q54_black = ifelse(q54==2,1,0),
q54_asian = ifelse(q54==3,1,0),
q54_hawai = ifelse(q54==4,1,0),
q54_native = ifelse(q54==5,1,0),
q54_other = ifelse(q54==6,1,0),
q55_latino = ifelse(q55==1,1,0)
) %>%
dplyr::select(-q54,-q55)
}
adjust_gender <- function(df){
df %>%
mutate(q57 = ifelse(q57==1,1,0))
}
adjust_marital <- function(df){
df %>%
mutate(q49 = ifelse(q49==1,1,0))
}
adjust_q11 <- function(df){
#Q11 has artificial ordinality between #of apps increasing, and response==5 as "Dont know"
#This function resolves this problem
df %>%
mutate(q11 = ifelse(q11==5 | q11==6, 0, q11))
}
adjust_children <- function(df){
df %>%
dplyr::select(-q50r2_inftod,-q50r3_6_12,-q50r4_13_17,-q50r5_adult)
}
adjust_income <- function(df){
df %>%
dplyr::mutate(q56 = case_when(
q56 <= 4 ~ 1,
q56 >= 5 & q56 <=8 ~ 2,
q56 >= 9 & q56 <=11 ~ 3,
q56 >= 12 & q56 <= 13 ~ 4,
q56 >= 14 ~ 5
))
}
adjust_age <- function(df){
df %>%
dplyr::mutate(
q1 = case_when(
q1 <= 2 ~ 1,
q1 >= 3 & q1 <= 5 ~ 2,
q1 >= 6 & q1 <= 8 ~ 3,
q1 >= 9 & q1 <= 11 ~ 4,
q1 >= 12 ~ 5,
)
)
}
adjust_names <- function(df){
df %>%
dplyr::rename(
q1_age=q1,
q11_appnum = q11,
q12_freeapppc = q12,
q48_edu = q48,
q49_marital = q49,
q56_income = q56,
q57_mf = q57,
q50r1_nochild = q50r1,
q50r2_inftod = q50r2,
q50r3_6_12 = q50r3,
q50r4_13_17 = q50r4,
q50r5_adult = q50r5
)
}
load('../data/apphappyData.RData')
df_labs <- tbl_df(apphappy.3.labs.frame)
df_nums <- tbl_df(apphappy.3.num.frame)
dim(df_nums)
## [1] 1800 89
rm(apphappy.3.num.frame); rm(apphappy.3.labs.frame)
df_labs$q57 <- as.factor(df_labs$q57)
df_labs$caseID <- NULL
df_nums$caseID <- NULL
df_labs$q2r10 <- NULL
df_nums$q2r10 <- NULL
df_labs$q5r1 <- NULL
df_nums$q5r1 <- NULL
df_nums$q12[is.na(df_nums$q12)] <- 0
df_nums %>% xtabs(~q12+q11,.,addNA = T)
## q11
## q12 1 2 3 4 5 6
## 0 0 0 0 0 0 24
## 1 6 9 5 3 1 0
## 2 31 56 48 66 2 0
## 3 23 42 117 114 11 0
## 4 12 44 150 198 12 0
## 5 23 44 158 209 20 0
## 6 68 78 131 71 24 0
df_nums %>% xtabs(~q4r10+q11,.,addNA = T)
## q11
## q4r10 1 2 3 4 5 6
## 0 156 254 564 595 67 24
## 1 7 19 45 66 3 0
df_nums %>% xtabs(~q4r11+q11,.,addNA = T)
## q11
## q4r11 1 2 3 4 5 6
## 0 148 267 606 661 65 8
## 1 15 6 3 0 5 16
# RULE (A): IF q4r11=TRUE, it means you have no apps. So that means q11 has to equal 6, i.e. NONE. Data shows this is violated. Correcting using this rule.
df_nums$q11[df_nums$q4r11==TRUE] <- 6
df_nums %>% xtabs(~q4r11+q11,.,addNA = T)
## q11
## q4r11 1 2 3 4 5 6
## 0 148 267 606 661 65 8
## 1 0 0 0 0 0 45
Since q11 can be ordinal, “none” should equal 0 instead of 6 to preserve ordinality
# RULE (B): Set q11=6 to q11=0
df_nums$q11[df_nums$q11==6] <- 0
df_nums %>% xtabs(~q11,., addNA = T)
## q11
## 0 1 2 3 4 5
## 53 148 267 606 661 65
‘Dont know’ doesn’t add value. Perhaps these can be set to NA and then imputed using mice.
# RULE (C): Set q11 Dont know to NA
df_nums$q11[df_nums$q11==0] <- NA
df_nums %>% xtabs(~q11,., addNA = T)
## q11
## 1 2 3 4 5 <NA>
## 148 267 606 661 65 53
# RULE (D): All NA values for q12 set to 6, since I'm approximating that if you don't have any apps, might as well could as free?
df_nums$q12[is.na(df_nums$q12)] <- 6
df_nums %>% xtabs(~q12,., addNA = T)
## q12
## 0 1 2 3 4 5 6
## 24 24 203 307 416 454 372
map_int(df_nums,~sum(is.na(.x))) %>% sort() %>% tail(3) %>% barchart(main='Missing values')
micemap_df(df_nums,~as.factor(.x)) %>%
mice::mice(printFlag = T, m = 5, method = 'rf') -> miceFit
##
## iter imp variable
## 1 1 q11 q57
## 1 2 q11 q57
## 1 3 q11 q57
## 1 4 q11 q57
## 1 5 q11 q57
## 2 1 q11 q57
## 2 2 q11 q57
## 2 3 q11 q57
## 2 4 q11 q57
## 2 5 q11 q57
## 3 1 q11 q57
## 3 2 q11 q57
## 3 3 q11 q57
## 3 4 q11 q57
## 3 5 q11 q57
## 4 1 q11 q57
## 4 2 q11 q57
## 4 3 q11 q57
## 4 4 q11 q57
## 4 5 q11 q57
## 5 1 q11 q57
## 5 2 q11 q57
## 5 3 q11 q57
## 5 4 q11 q57
## 5 5 q11 q57
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'default/
## America/Indiana/Indianapolis'
## Warning: Number of logged events: 25
df_nums <- tbl_df(mice::complete(miceFit))
# questions_to_consolidate <- c('q13','q24','q25','q26')
# plot_bi_plots(df_nums %>% dplyr::select(contains(questions_to_consolidate[1])),questions_to_consolidate[1])
# plot_bi_plots(df_nums %>% dplyr::select(contains(questions_to_consolidate[2])),questions_to_consolidate[2], xlim=c(0,.05), ylim=c(-0.01,0.002))
# plot_bi_plots(df_nums %>% dplyr::select(contains(questions_to_consolidate[3])),questions_to_consolidate[3])
# plot_bi_plots(df_nums %>% dplyr::select(contains(questions_to_consolidate[4])),questions_to_consolidate[4], xlim=c(0,0.05),ylim=c(-0.011,0.005))
df_nums <- map_df(df_nums,~as.numeric(as.character(.x)))
df_nums_adj <- df_nums %>%
adjust_q13() %>%
adjust_q24() %>%
adjust_q25() %>%
adjust_q26() %>%
adjust_q2() %>%
adjust_q4() %>%
adjust_race() %>%
adjust_gender() %>%
adjust_marital() %>%
adjust_income() %>%
adjust_age() %>%
adjust_q11() %>%
adjust_names() %>%
adjust_children()
## Warning: package 'bindrcpp' was built under R version 3.4.4
glimpse(df_nums_adj)
## Observations: 1,800
## Variables: 43
## $ q1_age <dbl> 1, 1, 3, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1...
## $ q11_appnum <dbl> 4, 3, 3, 4, 3, 4, 4, 3, 1, 2, 3, 2, 4...
## $ q12_freeapppc <dbl> 5, 5, 6, 4, 6, 3, 3, 6, 5, 2, 5, 6, 2...
## $ q48_edu <dbl> 3, 4, 3, 4, 4, 1, 3, 4, 3, 3, 4, 3, 4...
## $ q49_marital <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0...
## $ q50r1_nochild <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1...
## $ q56_income <dbl> 2, 3, 4, 2, 2, 1, 1, 3, 2, 2, 2, 2, 4...
## $ q57_mf <dbl> 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0...
## $ q13_visitfreq_social <dbl> 2.50, 3.00, 3.25, 4.00, 2.50, 4.00, 2...
## $ q13_visitfreq_music <dbl> 3.75, 3.50, 2.75, 3.25, 3.50, 3.50, 4...
## $ q13_visitfreq_video <dbl> 2.50, 2.00, 2.50, 2.50, 1.75, 2.25, 3...
## $ q24_tech_posatt <dbl> 2.8, 2.8, 3.0, 3.0, 2.2, 3.4, 2.8, 2....
## $ q24_tech_enter <dbl> 2.0, 2.0, 1.5, 3.0, 2.5, 1.0, 2.5, 2....
## $ q24_tech_comm <dbl> 2.666667, 1.333333, 2.666667, 3.66666...
## $ q24_tech_negatv <dbl> 1.0, 3.0, 4.5, 3.0, 1.5, 4.5, 1.5, 4....
## $ q25_prsnlty_leader <dbl> 1.50, 2.75, 2.50, 2.00, 3.25, 3.00, 3...
## $ q25_prsnlty_risk <dbl> 1.666667, 2.333333, 3.333333, 2.66666...
## $ q25_prsnlty_drive <dbl> 2.75, 2.25, 2.00, 3.25, 3.00, 2.50, 2...
## $ q25_prsnlty_follower <dbl> 5, 5, 6, 5, 4, 4, 2, 5, 5, 2, 6, 5, 2...
## $ q26_shopsavvy_bargain <dbl> 2.5, 3.5, 3.0, 3.0, 3.0, 3.0, 3.5, 1....
## $ q26_shopsavvy_brands <dbl> 3.4, 3.2, 3.4, 3.0, 4.0, 3.2, 2.8, 3....
## $ q26_shopsavvy_earn2spend <dbl> 2.666667, 3.333333, 3.666667, 3.66666...
## $ q26_shopsavvy_applover <dbl> 4.166667, 3.000000, 4.166667, 2.16666...
## $ q26_shopsavvy_children <dbl> 6, 6, 4, 6, 3, 5, 5, 6, 5, 1, 6, 6, 1...
## $ q2_apple <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1...
## $ q2_andriod <dbl> 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1...
## $ q2_windows <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1...
## $ q2_tablet <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1...
## $ q2_other <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1...
## $ q4_use_music_apps <dbl> 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1...
## $ q4_use_tv_apps <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1...
## $ q4_use_game_apps <dbl> 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1...
## $ q4_use_social_apps <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1...
## $ q4_use_news_apps <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1...
## $ q4_use_shop_apps <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1...
## $ q4_use_none_apps <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ q54_white <dbl> 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1...
## $ q54_black <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0...
## $ q54_asian <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ q54_hawai <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ q54_native <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ q54_other <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0...
## $ q55_latino <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
set1 <- c(
'q1_age',
# 'q11_appnum',
# 'q12_freeapppc',
'q48_edu',
'q49_marital',
'q50r1_nochild',
'q56_income',
# 'q57_mf',
# 'q13_visitfreq_social',
# 'q13_visitfreq_music',
# 'q13_visitfreq_video',
'q24_tech_posatt',
'q24_tech_enter',
'q24_tech_comm',
'q24_tech_negatv',
# 'q25_prsnlty_leader',
# 'q25_prsnlty_risk',
# 'q25_prsnlty_drive',
# 'q25_prsnlty_follower',
'q26_shopsavvy_bargain',
'q26_shopsavvy_brands',
'q26_shopsavvy_earn2spend',
'q26_shopsavvy_applover',
'q26_shopsavvy_children',
'q2_apple',
'q2_andriod',
'q2_windows',
# 'q2_tablet',
# 'q2_other',
# 'q4_use_music_apps',
# 'q4_use_tv_apps',
# 'q4_use_game_apps',
# 'q4_use_social_apps',
# 'q4_use_news_apps',
# 'q4_use_shop_apps',
# 'q4_use_none_apps',
'q54_white',
'q54_black',
'q54_asian',
'q54_hawai',
'q54_native',
# 'q54_other',
'q55_latino'
)
set2 <- c(
'q1_age',
# 'q11_appnum',
'q12_freeapppc',
'q48_edu',
'q49_marital',
'q50r1_nochild',
'q56_income',
# 'q57_mf',
'q13_visitfreq_social',
'q13_visitfreq_music',
'q13_visitfreq_video',
'q24_tech_posatt',
'q24_tech_enter',
'q24_tech_comm',
'q24_tech_negatv',
'q25_prsnlty_leader',
'q25_prsnlty_risk',
'q25_prsnlty_drive',
'q25_prsnlty_follower',
'q26_shopsavvy_bargain',
'q26_shopsavvy_brands',
'q26_shopsavvy_earn2spend',
'q26_shopsavvy_applover',
'q26_shopsavvy_children',
# 'q2_apple',
# 'q2_andriod',
# 'q2_windows',
# 'q2_tablet',
# 'q2_other',
# 'q4_use_music_apps',
# 'q4_use_tv_apps',
# 'q4_use_game_apps',
# 'q4_use_social_apps',
# 'q4_use_news_apps',
# 'q4_use_shop_apps',
# 'q4_use_none_apps',
'q54_white',
'q54_black',
'q54_asian',
'q54_hawai',
'q54_native',
# 'q54_other',
'q55_latino'
)
set3 <- c(
'q1_age',
'q11_appnum',
'q12_freeapppc',
'q48_edu',
'q49_marital',
'q50r1_nochild',
'q56_income',
'q57_mf',
# 'q13_visitfreq_social',
# 'q13_visitfreq_music',
# 'q13_visitfreq_video',
# 'q24_tech_posatt',
# 'q24_tech_enter',
# 'q24_tech_comm',
# 'q24_tech_negatv',
# 'q25_prsnlty_leader',
# 'q25_prsnlty_risk',
# 'q25_prsnlty_drive',
# 'q25_prsnlty_follower',
'q26_shopsavvy_bargain',
'q26_shopsavvy_brands',
'q26_shopsavvy_earn2spend',
'q26_shopsavvy_applover',
'q26_shopsavvy_children',
'q2_apple',
'q2_andriod',
'q2_windows',
'q2_tablet',
'q2_other',
'q4_use_music_apps',
'q4_use_tv_apps',
'q4_use_game_apps',
'q4_use_social_apps',
'q4_use_news_apps',
'q4_use_shop_apps',
'q4_use_none_apps'
# 'q54_white',
# 'q54_black',
# 'q54_asian',
# 'q54_hawai',
# 'q54_native',
# 'q54_other',
# 'q55_latino'
)
set4 <- c(
'q1_age',
# 'q11_appnum',
# 'q12_freeapppc',
# 'q48_edu',
'q49_marital',
'q50r1_nochild',
'q56_income',
'q57_mf',
'q13_visitfreq_social',
'q13_visitfreq_music',
'q13_visitfreq_video',
'q24_tech_posatt',
'q24_tech_enter',
'q24_tech_comm',
'q24_tech_negatv',
'q25_prsnlty_leader',
'q25_prsnlty_risk',
'q25_prsnlty_drive',
'q25_prsnlty_follower',
'q26_shopsavvy_bargain',
'q26_shopsavvy_brands',
'q26_shopsavvy_earn2spend',
'q26_shopsavvy_applover',
'q26_shopsavvy_children'
# 'q2_apple',
# 'q2_andriod',
# 'q2_windows',
# 'q2_tablet',
# 'q2_other',
# 'q4_use_music_apps',
# 'q4_use_tv_apps',
# 'q4_use_game_apps',
# 'q4_use_social_apps',
# 'q4_use_news_apps',
# 'q4_use_shop_apps',
# 'q4_use_none_apps'
# 'q54_white',
# 'q54_black',
# 'q54_asian',
# 'q54_hawai',
# 'q54_native',
# 'q54_other',
# 'q55_latino'
)
df_nums_adj_backup <- df_nums_adj
df_nums_adj <- df_nums_adj_backup
df_nums_adj <- df_nums_adj %>% dplyr::select(one_of(set4))
scaling_wanted <- T
minmax_wanted <- F
if (scaling_wanted) {
df_nums_adjscaled <- scale(df_nums_adj)
centers <- attributes(df_nums_adjscaled)[[3]]
spreads <- attributes(df_nums_adjscaled)[[4]]
df_nums_adjscaled <- tbl_df(df_nums_adjscaled)
}
if(minmax_wanted){
df_nums_adjscaled <- map_df(df_nums_adj, ~minmax(.x))
}
if(!scaling_wanted & !minmax_wanted){
df_nums_adjscaled <- df_nums_adj
}
head(df_nums_adjscaled)
df_nums_adjscaled %>% cor() %>% corrplot(method = 'shade',tl.col = 'black',tl.cex = .9, order = 'hclust', hclust.method = 'ward.D2')
# ordered=c(1:4,11,13:28)
# symm_bin=c(6:10,12,29:47)
# df_nums_adjscaled[ordered] <- map_df(df_nums_adjscaled[ordered],~as.ordered(.x))
# df_nums_adjscaled[symm_bin] <- map_df(df_nums_adjscaled[symm_bin],~as.factor(.x))
distMat <- daisy(df_nums_adjscaled)
## Warning in daisy(df_nums_adjscaled): binary variable(s) 2, 3, 5 treated as
## interval scaled
methods <- c('complete','average','ward.D','ward.D2','median','mcquitty','centroid')
method_vs_cop <- map_dbl(methods,~cor(cophenetic(hclust(d = distMat, method = .x)), distMat))
names(method_vs_cop) <- methods
barchart(sort(method_vs_cop))
k = 6
hclustFit <- hclust(d = distMat, method = 'ward.D')
plot(hclustFit, labels = F)
rect.hclust(hclustFit, k=k, border="red")
hclust_segments <- cutree(hclustFit, k = k)
table(hclust_segments)
## hclust_segments
## 1 2 3 4 5 6
## 302 432 154 380 164 368
clusplot(df_nums_adjscaled, hclust_segments, color=TRUE, shade=TRUE, labels=4, lines=0, main="HClust plot")
seg.summ(df_nums_adj, hclust_segments) -> centroids
centroids
cutFit <- cutree(hclustFit, k)
plot(silhouette(cutFit,distMat))
k <- 2:10
widths <- NULL
for(k_ in k){
hclustFit <- hclust(d = distMat, method = 'ward.D')
cutFit <- cutree(hclustFit, k_)
widths <- c(widths,mean(silhouette(cutFit,distMat)[,'sil_width']))
}
plot(k,widths,type='b')
cbind(k,widths)
## k widths
## [1,] 2 0.15040129
## [2,] 3 0.08247704
## [3,] 4 0.08266900
## [4,] 5 0.06700428
## [5,] 6 0.05121959
## [6,] 7 0.05488371
## [7,] 8 0.04953243
## [8,] 9 0.05085210
## [9,] 10 0.04773323
wssplot <- function(numsub, nc=15, seed=1111) {
wss <- (nrow(numsub)-1)*sum(apply(numsub,2,var))
for (i in 2:nc) {
set.seed(seed)
wss[i] <- sum(kmeans(numsub, centers=i, iter.max = 1e4)$withinss)}
plot(1:nc, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")}
wssplot(df_nums_adjscaled)
k <- 2:10
widths <- NULL
for(k_ in k){
clusterresults <- kmeans(x = df_nums_adjscaled, centers = k_, nstart = 30)
widths <- c(widths,mean(silhouette(clusterresults$cluster,distMat)[,'sil_width']))
}
plot(k,widths,type='b')
cbind(k,widths)
## k widths
## [1,] 2 0.15633032
## [2,] 3 0.10924977
## [3,] 4 0.10559810
## [4,] 5 0.10292595
## [5,] 6 0.09512481
## [6,] 7 0.09275647
## [7,] 8 0.08705368
## [8,] 9 0.08322454
## [9,] 10 0.08221007
# Using cluster centers from hclust:
seg.summ(df_nums_adjscaled, hclust_segments) -> kmeans_centroids
clusterresults <- kmeans(x = df_nums_adjscaled, centers = kmeans_centroids[,-1])
rsquare <- clusterresults$betweenss/clusterresults$totss
cat('\nWithin SS:',clusterresults$withinss,' Sizes:\n',clusterresults$size,'\nrsq:', rsquare)
##
## Within SS: 3432.662 5125.88 3631.431 5046.119 3485.899 3946.495 Sizes:
## 253 392 214 395 257 289
## rsq: 0.3470318
clusplot(df_nums_adjscaled, clusterresults$cluster, color=TRUE, shade=TRUE, labels=4, lines=0, main="K-means cluster plot")
plot(silhouette(clusterresults$cluster,distMat))
# Using number of clusters + random starts
k_kmeans = 6
clusterresults <- kmeans(x = df_nums_adjscaled, centers = k_kmeans)
rsquare <- clusterresults$betweenss/clusterresults$totss
cat('\nWithin SS:',clusterresults$withinss,' Sizes:\n',clusterresults$size,'\nrsq:', rsquare)
##
## Within SS: 4401.516 2878.69 5051.376 3682.317 4713.007 4087.528 Sizes:
## 341 222 380 270 323 264
## rsq: 0.3431686
clusplot(df_nums_adjscaled, clusterresults$cluster, color=TRUE, shade=TRUE, labels=4, lines=0, main="K-means cluster plot")
plot(silhouette(clusterresults$cluster,distMat))
seg.summ(df_nums_adj, clusterresults$cluster) -> segsummary_results
segsummary_results %>% write_csv(path = '../reports/kmeans_results.csv', col_names = T)
segsummary_results
kmeans_results <- df_nums_adj %>%
mutate(cluster = clusterresults$cluster)
purrr::map2(kmeans_results, names(kmeans_results),
~bwplot(cluster~.x, kmeans_results,
main=.y))
## $q1_age
##
## $q49_marital
##
## $q50r1_nochild
##
## $q56_income
##
## $q57_mf
##
## $q13_visitfreq_social
##
## $q13_visitfreq_music
##
## $q13_visitfreq_video
##
## $q24_tech_posatt
##
## $q24_tech_enter
##
## $q24_tech_comm
##
## $q24_tech_negatv
##
## $q25_prsnlty_leader
##
## $q25_prsnlty_risk
##
## $q25_prsnlty_drive
##
## $q25_prsnlty_follower
##
## $q26_shopsavvy_bargain
##
## $q26_shopsavvy_brands
##
## $q26_shopsavvy_earn2spend
##
## $q26_shopsavvy_applover
##
## $q26_shopsavvy_children
##
## $cluster
pamFits <- tibble(k_pam = 2:10)
pamFits$pamFits <- map(pamFits$k_pam, ~pam(df_nums_adjscaled, k = .x))
pamFits$sil_avg_width <- map_dbl(pamFits$pamFits,~.x$silinfo$avg.width)
pamFits
pamFits %>%
xyplot(sil_avg_width~as.factor(k_pam),.,type='b')
plot(pamFits$pamFits[[1]])
seg.summ(df_nums_adj, pamFits$pamFits[[1]]$clustering)
mclustFits <- tibble(mclust_clusters=2:8)
mclustFits$mclustFits <- map(mclustFits$mclust_clusters, ~Mclust(df_nums_adjscaled, G = .x))
mclustFits$bic <- map_dbl(mclustFits$mclustFits, ~.x[['bic']])
mclustFits$loglik <- map_dbl(mclustFits$mclustFits, ~.x[['loglik']])
mclustFits
mclustFits %>%
xyplot(bic+loglik~as.factor(mclust_clusters),., auto.key = T, type = 'b')
summary(mclustFits[[4,'mclustFits']])
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VII (spherical, varying volume) model with 5 components:
##
## log.likelihood n df BIC ICL
## -48849.07 1800 114 -98552.63 -98971.1
##
## Clustering table:
## 1 2 3 4 5
## 522 273 313 317 375
mclust_clusters <- mclustFits[[4,'mclustFits']]$classification
clusplot(df_nums_adj, mclust_clusters, color=TRUE, shade=TRUE, labels=4, lines=0, main="M Clust Plot, k = 4")
seg.summ(df_nums_adj, mclust_clusters)
mclust_distMat <- daisy(df_nums_adj)
## Warning in daisy(df_nums_adj): binary variable(s) 2, 3, 5 treated as
## interval scaled
plot(silhouette(mclust_clusters,mclust_distMat))